import csv
import json

import requests
from bs4 import BeautifulSoup
# Douban "hot movies" JSON endpoint. ``{}`` receives the paging offset
# (``page_start``); the page size is fixed in the URL by ``page_limit=20``.
star_url = 'https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start={}'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}

# Offset increment between list requests; must match ``page_limit`` in star_url.
PAGE_SIZE = 20
# CSV file that scraped rows are appended to.
OUTPUT_PATH = 'douban.csv'
# Seconds to wait for a response before giving up; without a timeout
# ``requests`` can block forever on a stalled connection.
REQUEST_TIMEOUT = 10
# Number of leading actors kept per film (the original kept exactly two).
MAX_ACTORS = 2
# Column order of every CSV row.
CSV_FIELDS = ('title', 'rate', 'actor', 'synopsis', 'short')


def normalize_text(text):
    """Collapse every run of whitespace in *text* into one space.

    ``str.split()`` with no argument splits on all Unicode whitespace, so
    newlines, non-breaking spaces and full-width (ideographic) spaces are
    all handled, and leading/trailing whitespace is dropped.
    """
    return ' '.join(text.split())


def join_actors(names, limit=MAX_ACTORS):
    """Return the first *limit* non-empty actor names joined by ``/``.

    Unlike indexing ``[0]`` and ``[1]`` directly, this works for films that
    list fewer than *limit* actors (including none, which yields ``''``).
    """
    cleaned = [normalize_text(name) for name in names]
    return '/'.join([name for name in cleaned if name][:limit])


def build_row(content):
    """Return the CSV row (list of strings) for one movie dict.

    Keys that are missing or ``None`` become empty strings so that a partial
    record still produces a row with the full set of columns.
    """
    row = []
    for key in CSV_FIELDS:
        value = content.get(key)
        row.append('' if value is None else str(value))
    return row


def append_row(row, path=OUTPUT_PATH):
    """Append *row* to the CSV file at *path*.

    ``csv.writer`` quotes fields that contain commas, quotes or newlines, so
    free-text columns cannot break the column layout. ``newline=''`` is what
    the csv module requires to control line endings itself.
    """
    # GBK with errors='ignore' is kept from the original: characters that GBK
    # cannot represent are dropped from the output.
    with open(path, 'a', encoding='GBK', errors='ignore', newline='') as f:
        csv.writer(f).writerow(row)


def fetch_text(url):
    """GET *url* and return the response body decoded as UTF-8.

    Raises ``requests.RequestException`` on connection problems, timeouts and
    non-2xx status codes.
    """
    # NOTE(review): verify=False disables TLS certificate checking. It is kept
    # because the original relied on it; remove it if the environment allows.
    response = requests.get(url, headers=headers, verify=False,
                            timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.content.decode('utf-8', errors='replace')


def parse_detail(html):
    """Extract ``short``, ``actor`` and ``synopsis`` from a detail page.

    Every element is optional: anything missing from the page yields an empty
    string instead of raising, so one unusual page cannot abort the crawl.
    """
    soup = BeautifulSoup(html, 'lxml')

    # First short comment on the page, if any.
    short = ''
    short_tag = soup.find('span', attrs={'class': 'short'})
    if short_tag is not None:
        short = normalize_text(short_tag.get_text())

    # Actor links live in <span class="actor"><span class="attrs"><a>...</a>.
    actor_names = []
    actor_tag = soup.find('span', attrs={'class': 'actor'})
    if actor_tag is not None:
        attrs_tag = actor_tag.find('span', attrs={'class': 'attrs'})
        if attrs_tag is not None:
            actor_names = [a.get_text() for a in attrs_tag.find_all('a')]

    # Synopsis is the first <span> inside <div id="link-report">.
    synopsis = ''
    report = soup.find('div', attrs={'id': 'link-report'})
    if report is not None and report.span is not None:
        synopsis = normalize_text(report.span.get_text())

    return {
        'short': short,
        'actor': join_actors(actor_names),
        'synopsis': synopsis,
    }


def main():
    """Walk the hot-movie list page by page and append each film to the CSV.

    Stops when the list endpoint returns no subjects. A film whose detail
    page cannot be fetched is reported and skipped; a failure of the list
    endpoint itself propagates, since the crawl cannot continue without it.
    """
    offset = 0
    while True:
        page = json.loads(fetch_text(star_url.format(offset)))
        subjects = page.get('subjects')
        # Parsed check instead of comparing the raw body to a fixed string.
        if not subjects:
            print('到头了，豆瓣枯了')
            break
        for content in subjects:
            title = str(content.get('title', ''))
            detail_url = content.get('url')
            if not detail_url:
                print('爬取《' + title + '》失败，已跳过：缺少详情页链接')
                continue
            try:
                content.update(parse_detail(fetch_text(detail_url)))
            except requests.RequestException as exc:
                print('爬取《' + title + '》失败，已跳过：' + str(exc))
                continue
            append_row(build_row(content))
            print('爬取《' + title + '》成功！')
        offset += PAGE_SIZE


if __name__ == '__main__':
    main()